analysis_df =
  left_join(applicants, 
            survey_resp,
            by = "ROWNUM_ID") %>%
  #make failure (instead of success var)
  mutate(f1 = case_when(t1 == 0 ~ 1,
                        t1 == 1 ~ 0,
                        TRUE ~ NA_real_),
         f2 = case_when(t2 == 0 ~ 1,
                        t2 == 1 ~ 0,
                        TRUE ~ NA_real_),
         f3 = case_when(t3 == 0 ~ 1,
                        t3 == 1 ~ 0,
                        TRUE ~ NA_real_),
         forcing_f1 = forcing_t1*(-1),
         forcing_f2 = forcing_t2*(-1),
         forcing_f3 = forcing_t3*(-1),
         no_job = (got_job == 0)*1) %>%
  mutate(matriculant = case_when(q_10_survey_accept_pns == "Ya" ~ 1,
                                 q_10_survey_accept_pns == "Tidak" ~ 0,
                                 TRUE ~ NA_real_))

analysis_df <-
  analysis_df %>%
  mutate(java_indicator = (kab_code_residence > 3000 & kab_code_residence < 4000)*1,
         urb_rur = ((str_sub(kab_code_residence, 3, 4)) > 70)*1,
         islam = (RELIGION == 1)*1,
         selection_indicator = (!is.na(q1_consent))*1)


analysis_df_prediction <- 
  analysis_df %>%
  filter(!is.na(selection_indicator), !is.na(AGE), !is.na(GENDER), !is.na(urb_rur), !is.na(islam), !is.na(java_indicator))

ipw_mod <- glm(selection_indicator ~ AGE + factor(GENDER) + factor(urb_rur) + factor(islam) + factor(java_indicator), 
               data = analysis_df_prediction, 
               family = binomial(link = "logit"))

analysis_df_prediction <- 
  analysis_df_prediction %>%
  mutate(prob = predict(ipw_mod, type = 'response')) %>%
  mutate(invwt = selection_indicator/prob + (1-selection_indicator)/(1-prob))


weights_df <-
  analysis_df_prediction %>%
  select(ROWNUM_ID, invwt)


analysis_df_subset <-
  analysis_df %>%
  filter(!is.na(q1_consent)) %>%
  left_join(., weights_df)



#calculate the religious group score differences using the full data

library(Hmisc)

options(scipen = 999)
merge_scores_job_loc =
  analysis_df %>%
  filter(skd_score != 0) %>%
  group_by(kab_code_job_loc) %>%
  summarise(avg_muslim = mean(skd_score[RELIGION == 1], na.rm = T),
            avg_non_muslim = mean(skd_score[RELIGION != 1], na.rm = T)) %>%
  mutate(relg_score_diff = abs(avg_muslim - avg_non_muslim)) %>%
  select(kab_code_job_loc, relg_score_diff_job_loc = relg_score_diff) %>%
  mutate(quintile_job_loc = cut2(relg_score_diff_job_loc, g = 5, digits = 2)) %>%
  mutate(quintile_job_loc = str_remove_all(quintile_job_loc, "\\[|\\]|\\)|\\(")) %>%
  separate(quintile_job_loc, sep = "\\,", into = c("min", "max")) %>%
  mutate(min = round(as.numeric(min), digits = 1)) %>%
  mutate(label = paste0("[", min, ",", max, ")")) %>%
  filter(!is.na(min), !is.na(max)) %>%
  select(kab_code_job_loc, quintile_relg_diff_job_loc = label)


merge_scores_residence =
  analysis_df %>%
  filter(skd_score != 0) %>%
  group_by(kab_code_residence) %>%
  summarise(avg_muslim = mean(skd_score[RELIGION == 1], na.rm = T),
            avg_non_muslim = mean(skd_score[RELIGION != 1], na.rm = T)) %>%
  mutate(relg_score_diff = abs(avg_muslim - avg_non_muslim)) %>%
  select(kab_code_residence, relg_score_diff_residence = relg_score_diff) %>%
  mutate(quintile_residence = cut2(relg_score_diff_residence, g = 5, digits = 2)) %>%
  mutate(quintile_residence = str_remove_all(quintile_residence, "\\[|\\]|\\)|\\(")) %>%
  separate(quintile_residence, sep = "\\,", into = c("min", "max")) %>%
  mutate(min = round(as.numeric(min), digits = 1)) %>%
  mutate(label = paste0("[", min, ",", max, ")")) %>%
  filter(!is.na(min), !is.na(max)) %>%
  select(kab_code_residence, quintile_relg_diff_residence = label)


analysis_df_subset = 
  left_join(analysis_df_subset, merge_scores_job_loc) %>%
  left_join(., merge_scores_residence)




#select the variables we will use

analysis_df_subset <-
  analysis_df_subset %>%
  filter(q1_consent != "Tidak") %>%
  dplyr::select(
    
    ROWNUM_ID, AGE, GENDER, RELIGION, q5_survey_ethnic, java_indicator, urb_rur, islam, invwt, q1_consent, instansi_category,
    
    skd_score, forcing_final = forcing_t1, 
    pass_final = t1, forcing_skd = forcing_t3, fail_skd = f3, got_job, matriculant,
    
    q7_survey_study_type, q8_survey_study_length, 
    
    q13_survey_success_reason_1_merit, q13_survey_success_reason_2_connection, q13_survey_success_reason_3_sara, 
    q14_survey_test_vs_connection, q15_survey_test_java, q15_survey_test_muslim,
    q16_survey_transparent, q17_survey_java1, q17_survey_java2, q17_survey_java3,
    q18_survey_daerah1, q18_survey_daerah2, q18_survey_daerah3, q19_survey_relg1,
    q19_survey_relg2, q19_survey_relg3, q20_survey_pancasila_h3, q21_survey_nation_ethnic_h3, 
    q22_survey_working, q23_survey_look_work, q26_survey_income, q27_survey_job_satis,
    
    quintile_relg_diff_residence, quintile_relg_diff_job_loc
    
  )



write.csv(analysis_df_subset, "./BKN survey/apsr_kuipers_replication_file/_3_data/main_analysis_data.csv")
